1 Settings

2 Purpose

  • Create a reusable template for future analyses of regression problems, implementing ideas from the books ‘Applied Predictive Modeling’ and ‘Introduction to Statistical Learning’.

3 Review Data Set

3.2 Dimensions

dim(df)
## [1] 32 11

3.3 Structure

str(df)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

3.4 Summary

summary(df)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

3.5 Potential Outliers Based on Percentiles

# Flag observations whose mpg falls outside the central 95% of the
# distribution (below the 2.5th or above the 97.5th percentile).
lower_bound <- quantile(df$mpg, 0.025)
upper_bound <- quantile(df$mpg, 0.975)
outlier_df <- which(df$mpg < lower_bound | df$mpg > upper_bound)
remove(lower_bound)
remove(upper_bound)
# Show the flagged rows (they are only reported here, not removed).
df[outlier_df,]
##                 mpg cyl disp hp drat    wt qsec vs am gear carb
## Toyota Corolla 33.9   4 71.1 65 4.22 1.835 19.9  1  1    4    1
remove(outlier_df)

3.6 Remove Predictors with too many NAs (missing data)

# Drop columns where 90% or more of the values are NA
# (colMeans(is.na(df)) gives the per-column share of missing values).
df <- df[,colMeans(is.na(df)) < .9]
dim(df)
## [1] 32 11

3.7 Near Zero Variance Predictors

The identified near zero variance predictors are the following:

# create a zero variance variable for demonstration purposes
df$one <- 1
# nearZeroVar() (caret) returns the column indices of predictors with
# (near) zero variance; here it should pick up the constant 'one' column.
near_zero_vars <- nearZeroVar(df)
df[, near_zero_vars]
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

After the exclusion of near-zero-variance predictors the data set looks as follows:

# Drop the identified near-zero-variance columns by index.
df <- df[, -c(near_zero_vars)]
remove(near_zero_vars)
head(df)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

3.8 Reduce Collinearity

Collinearity is the situation where a pair of predictor variables have a substantial correlation with each other. In general, there are good reasons to avoid data with highly correlated predictors as it can result in highly unstable models and degraded predictive performance.

3.8.1 Plot Correlations

The darker areas in the correlation plot show variables which are correlated with each other.

# filter on numeric variables (in this case exclude 'mpg' as it represents the outcome, not a predictor)
# NOTE(review): this assumes 'mpg' is the first column of df — selecting
# by name (e.g. df[, setdiff(names(df), "mpg")]) would be more robust.
predictors <- df[, -c(1)]
# iris[, !sapply(iris, is.numeric)]
# select non_numeric predictors, to be added back later
predictors_non_numeric <- predictors[, !sapply(predictors, is.numeric)]
predictors_numeric <- predictors[,sapply(predictors, is.numeric)]
# Pairwise Pearson correlation matrix of the numeric predictors.
correlations <- cor(predictors_numeric)
corrplot::corrplot(correlations, order = "hclust",tl.cex = 0.5)

3.8.2 Filter pairwise correlations

Removing following predictors:

# findCorrelation() (caret) flags predictors to remove so that no
# remaining pairwise absolute correlation exceeds the cutoff (0.75).
highCorr <- findCorrelation(correlations, cutoff = 0.75)
remove(correlations)
head(predictors_numeric[highCorr])
##                   cyl disp gear
## Mazda RX4           6  160    4
## Mazda RX4 Wag       6  160    4
## Datsun 710          4  108    4
## Hornet 4 Drive      6  258    3
## Hornet Sportabout   8  360    3
## Valiant             6  225    3

3.8.3 Remaining predictors

# Drop the highly correlated predictors identified above.
predictors_numeric <- predictors_numeric[, -highCorr]
remove(highCorr)
names(predictors_numeric)
## [1] "hp"   "drat" "wt"   "qsec" "vs"   "am"   "carb"

3.8.4 Dataset after removal of predictors

# Rebuild df from the retained numeric predictors, the non-numeric
# predictors set aside earlier (which the comment at their creation
# promised to add back, but the original code never did), and the
# outcome 'mpg'. For mtcars 'predictors_non_numeric' has zero columns,
# so this is a no-op on this data set, but it keeps the template correct
# for data sets that do contain non-numeric predictors.
df <- cbind(df[, names(df) %in% names(predictors_numeric)],
            predictors_non_numeric,
            subset(df, select = "mpg", drop = FALSE))
remove(predictors_numeric)
remove(predictors_non_numeric)
head(df)
##                    hp drat    wt  qsec vs am carb  mpg
## Mazda RX4         110 3.90 2.620 16.46  0  1    4 21.0
## Mazda RX4 Wag     110 3.90 2.875 17.02  0  1    4 21.0
## Datsun 710         93 3.85 2.320 18.61  1  1    1 22.8
## Hornet 4 Drive    110 3.08 3.215 19.44  1  0    1 21.4
## Hornet Sportabout 175 3.15 3.440 17.02  0  0    2 18.7
## Valiant           105 2.76 3.460 20.22  1  0    1 18.1

Dimension of dataset after removal of highly correlated predictors:

dim(df)
## [1] 32  8

Review correlation plot again after removal of correlated predictors (reduced collinearity):

# Correlation plot of the remaining predictors; column 8 is the outcome
# 'mpg' and is excluded from the correlation matrix.
correlations <- cor(df[, -c(8)])
corrplot::corrplot(correlations, order = "hclust",tl.cex = 0.5)

remove(correlations)
remove(predictors)

The darker areas should be reduced as a result of having removed correlated predictors.

3.9 EDA

3.9.1 Histogram

3.9.1.1 Base R

# Base R histogram of the outcome; the square-root rule is used to
# suggest the number of bins.
hist(x = df$mpg,
  xlab = "mpg",
  main = "Histogram of mpg",
  breaks = sqrt(nrow(df))
)

3.9.1.2 ggplot

ggplot(df) +
  aes(x = mpg) +
  geom_histogram(bins = 5L, fill = "#0c4c8a") +
  theme_minimal()

3.9.1.3 lattice

# Work on a copy so the factor conversions do not affect the modeling df.
df_lattice <- df
# convert to factor
df_lattice$vs <- as.factor(ifelse(test = df_lattice$vs == 0, yes = "v_shaped", no = "straight"))
df_lattice$am <- as.factor(ifelse(test = df_lattice$am == 0, yes = "automatic", no = "manual"))
# plot
# Histogram of mpg, one panel per transmission type.
lattice::histogram(~mpg | am, data = df_lattice)

3.9.1.4 Lattice Density Plot

lattice::densityplot(~mpg | am, 
                     data = df_lattice, 
                     groups = vs, 
                     plot.points = FALSE, 
                     auto.key = TRUE)

3.9.2 Scatter Plot

3.9.2.1 Base R

plot(x = df$hp, y = df$mpg)
lm_mpg_hp <- lm(mpg ~ hp, data = df)
abline(lm_mpg_hp)

3.9.2.1.1 With Quadratic Element
# create quadratic linear model
lm_mpg_hp <- lm(mpg ~ hp + I(hp^2), data = df)
# sequence of x variable: 100 evenly spaced hp values spanning the
# observed range, used as a smooth grid for plotting the fitted curve
# (use '<-' for assignment, not '=')
newdat <- data.frame(hp = seq(min(df$hp), max(df$hp), length.out = 100))
# predict on the grid
newdat$pred <- predict(lm_mpg_hp, newdata = newdat)
remove(lm_mpg_hp)
plot(mpg ~ hp, data = df)
# plot prediction
lines(x = newdat$hp, y = newdat$pred)

remove(newdat)

3.9.2.2 ggplot

ggplot(data = df_lattice, aes(x = hp, y = mpg, shape = vs, colour = am, size = wt)) +
  geom_point()

3.9.2.3 ggplot - LM

ggplot(data = df_lattice, aes(x = hp, y = mpg)) +
        geom_point() +
        stat_smooth(method = lm, se = TRUE)
## `geom_smooth()` using formula = 'y ~ x'

3.9.2.4 pairs()

pairs(df)

3.9.2.5 caret::featurePlot - pairs

caret::featurePlot(x = df[, -5], 
            y = df$vs, 
            plot = "pairs")

3.9.2.6 ggpairs()

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(data = df,progress = FALSE)

3.9.2.7 lattice

# Lattice scatter plot of mpg vs hp, one panel per transmission type;
# "smooth" adds a loess curve, "g" draws a reference grid.
# (fixes the typo "Lattie" in the plot title)
xyplot(mpg ~ hp | am, 
       data = df_lattice, 
       main = "Lattice Scatter Plot in R", 
       type = c("p", "g", "smooth"))

3.9.2.8 featurePlot()

suppressWarnings(
featurePlot(x = df_lattice[, -c(5,6,8)], 
            y = df_lattice$mpg, 
            plot = "scatter",
            type = c("p", "g", "smooth"),
            layout = c(3, 2))
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number -0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number -0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number -0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number -0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number -0

3.9.2.8.1 SPLOM
splom(df)

3.9.3 Line Graph

3.9.3.1 Base R

# Sort the cars by mpg and draw an index plot of the sorted values.
# seq_along() replaces 1:length(x), which misbehaves for empty vectors.
df_sorted_mpg <- df_lattice[order(df_lattice$mpg),]
plot(seq_along(df_sorted_mpg$mpg), y = df_sorted_mpg$mpg)
lines(x = seq_along(df_sorted_mpg$mpg),
      y = df_sorted_mpg$mpg, 
      pch = 18, 
      col = "blue", 
      type = "b", 
      lty = 2)
# Add a legend to the plot
legend("topleft", 
       legend=c("Line 1"),
       col=c("blue"), 
       lty = 1:2, 
       cex=0.8)

3.9.3.2 ggplot

ggplot(data = df_sorted_mpg, aes(
        x = 1:length(df_sorted_mpg$mpg),
        y = mpg,
        colour = vs,
        # group = supp,
        # fill = xxx, 
        linetype = am
)) +
        geom_line() +
        ylim(0, max(df_sorted_mpg$mpg) * 1.1) +
        expand_limits(y = 0) +
        geom_point()

remove(df_sorted_mpg)

3.9.3.3 lattice

3.9.4 Box Plot

3.9.4.1 Base R

boxplot(df$mpg,
  ylab = "mpg"
)

3.9.4.2 ggplot

ggplot(df) +
  aes(x = "", y = mpg) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()

Box Plot Outliers:

# boxplot.stats()$out returns the data values lying beyond the whiskers
# (more than 1.5 * IQR outside the box).
out <- boxplot.stats(df$mpg)$out
out_ind <- which(df$mpg %in% c(out))
remove(out)
# Show the outlier rows (empty for mtcars mpg).
df[out_ind, ]
## [1] hp   drat wt   qsec vs   am   carb mpg 
## <0 rows> (or 0-length row.names)
remove(out_ind)

3.9.4.3 lattice

bwplot(mpg ~ am | vs, df_lattice)

# remove(df_lattice)

4 Linear Regression (interpretable)

Start with all variables and exclude irrelevant ones.

# At this point df has columns: hp drat wt qsec vs am carb mpg, so
# dropping columns c(5,1,2,7,6) keeps wt, qsec and the outcome mpg.
# NOTE(review): selecting by name would be more robust than by position.
lm_df <- lm(mpg ~ ., data = df[, -c(5,1,2,7,6)])
summary(lm_df)
## 
## Call:
## lm(formula = mpg ~ ., data = df[, -c(5, 1, 2, 7, 6)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3962 -2.1431 -0.2129  1.4915  5.7486 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.7462     5.2521   3.760 0.000765 ***
## wt           -5.0480     0.4840 -10.430 2.52e-11 ***
## qsec          0.9292     0.2650   3.506 0.001500 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.596 on 29 degrees of freedom
## Multiple R-squared:  0.8264, Adjusted R-squared:  0.8144 
## F-statistic: 69.03 on 2 and 29 DF,  p-value: 9.395e-12

4.1 Diagnostic Plots

par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(lm_df)

par(mfrow=c(1,1)) # Change back to 1 x 1

4.1.1 Interpretation of Diagnostic Plots

4.1.1.1 1. Residuals vs Fitted

This plot shows if residuals have non-linear patterns. There could be a non-linear relationship between predictor variables and an outcome variable and the pattern could show up in this plot if the model doesn’t capture the non-linear relationship. If you find equally spread residuals around a horizontal line without distinct patterns, that is a good indication you don’t have non-linear relationships.

4.1.1.2 2. Normal Q-Q

This plot shows if residuals are normally distributed. Do residuals follow a straight line well or do they deviate severely? It’s good if residuals are lined well on the straight dashed line.

4.1.1.3 3. Scale-Location

It’s also called Spread-Location plot. This plot shows if residuals are spread equally along the ranges of predictors. This is how you can check the assumption of equal variance (homoscedasticity). It’s good if you see a horizontal line with equally (randomly) spread points.

4.1.1.4 4. Residuals vs Leverage

This plot helps us to find influential cases if any. Not all outliers are influential in linear regression analysis (whatever outliers mean). Even though data have extreme values, they might not be influential to determine a regression line. That means, the results wouldn’t be much different if we either include or exclude them from analysis. They follow the trend in the majority of cases and they don’t really matter; they are not influential. On the other hand, some cases could be very influential even if they look to be within a reasonable range of the values. They could be extreme cases against a regression line and can alter the results if we exclude them from analysis. Another way to put it is that they don’t get along with the trend in the majority of the cases.

Unlike the other plots, this time patterns are not relevant. We watch out for outlying values at the upper right corner or at the lower right corner. Those spots are the places where cases can be influential against a regression line. Look for cases outside of a dashed line, Cook’s distance. When cases are outside of the Cook’s distance (meaning they have high Cook’s distance scores), the cases are influential to the regression results. The regression results will be altered if we exclude those cases.

4.2 Quadratic Linear Regression

lm_df <- lm(mpg ~ wt + I(wt^2) + qsec, data = df)
summary(lm_df)
## 
## Call:
## lm(formula = mpg ~ wt + I(wt^2) + qsec, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2200 -1.2521 -0.6288  0.9357  5.1761 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  32.6418     5.6768   5.750 3.59e-06 ***
## wt          -12.4331     2.0842  -5.965 2.01e-06 ***
## I(wt^2)       1.0730     0.2970   3.613 0.001174 ** 
## qsec          0.8599     0.2236   3.846 0.000634 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.182 on 28 degrees of freedom
## Multiple R-squared:  0.8816, Adjusted R-squared:  0.8689 
## F-statistic:  69.5 on 3 and 28 DF,  p-value: 4.345e-13

4.2.1 Plot Prediction vs Actual

# create dataframe with predictions
# create x input values
# NOTE(review): wt and qsec are varied jointly (both traverse their
# ranges in lock-step), so the plotted line is a 1-D slice through the
# 2-predictor surface rather than a curve at fixed qsec — confirm this
# is the intended visualization.
pred_df <- data.frame(wt = seq(min(df$wt), max(df$wt), length.out = 100),
                      qsec = seq(min(df$qsec), max(df$qsec), length.out = 100))
# predict
pred_df$pred <- predict(lm_df, newdata = pred_df)
# plot actual data
plot(mpg ~ wt, data = df)
# plot prediction
# pred_df <- pred_df[order(pred_df$wt),]
lines(x = pred_df$wt, y = pred_df$pred, lwd = 1, lty = 2, col = c("blue"))
# Add a legend to the plot
legend("topleft", 
       legend=c("LM Prediction"),
       col=c("blue"), 
       lty = 2, 
       cex=0.8)

remove(pred_df)

4.3 Diagnostic Plots

par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(lm_df)

par(mfrow=c(1,1)) # Change back to 1 x 1
remove(lm_df)
remove(df_lattice)

Residuals vs Fitted plot is cleaner now after having considered the quadratic relationship.


5 Linear Regression (Caret)

5.1 Split Dataset

# Stratified 80/20 train/test split on the outcome (createDataPartition
# samples within mpg quantile groups to keep the distributions similar).
index_train <- caret::createDataPartition(df$mpg,
                                          p = 0.8,
                                          list = FALSE)
df_train <- df[index_train,]
df_test <- df[-index_train,]
remove(index_train)

5.2 Review Preprocessed: PCA

# Preprocessing pipeline: Box-Cox transform, then center/scale, then PCA
# keeping enough components for 95% of the variance (thresh = 0.95).
preproc_alg <- caret::preProcess(df_train,
                                 thresh = 0.95,
                                 method = c("BoxCox",
                                            "center",
                                            "scale",
                                            "pca"))
# Apply the fitted transformations to the training data.
df_train_prpr <- stats::predict(preproc_alg, df_train)
head(df_train_prpr)
##                          PC1        PC2         PC3        PC4         PC5
## Mazda RX4 Wag     -0.1669268  1.7180059 -0.07991657 -0.4140913 -0.87960495
## Datsun 710        -2.4244826  0.2022331  0.47306897  0.9208873  0.05303057
## Hornet 4 Drive    -1.0122492 -1.9941515  0.51515573  0.3753043  0.36782282
## Hornet Sportabout  1.1130121 -0.4317811  0.83220698 -0.4993654  0.27773113
## Valiant           -0.5935989 -2.6417352  0.63921368  0.7373085 -0.11366940
## Duster 360         2.3883275  0.3207399  0.17311649 -0.1641141  0.60605433

5.2.1 Loadings

preproc_alg$rotation
##             PC1         PC2         PC3         PC4         PC5
## hp    0.4207703  0.13617289  0.01625092  0.31325119  0.35841135
## drat -0.3029680  0.43234519 -0.52876009 -0.16433618  0.30553159
## wt    0.3976895 -0.25688437 -0.26896143  0.15082129 -0.26674989
## qsec -0.2853227 -0.49852303 -0.34027042 -0.01154455 -0.45592925
## vs   -0.3795434 -0.23342279 -0.37562357  0.55737082  0.36320363
## am   -0.2292635  0.55509952  0.14188844  0.55764412 -0.54899896
## carb  0.3423247  0.32900243 -0.58867527 -0.23756272 -0.25358917
## mpg  -0.4216583  0.09568444  0.15575784 -0.41703926 -0.03924999
preproc_alg$std
##         hp       drat         wt       qsec         vs         am       carb 
## 1.34306317 0.09317022 0.43097395 0.10436273 0.50395263 0.49734746 0.61279798 
##        mpg 
## 0.30237150

5.2.2 Base R PCA Plots

The base R `plot()` method for a `prcomp` object produces a scree plot — a bar chart of the variance explained by each principal component:

df_prcomp <- prcomp(df_train, center = TRUE,scale. = TRUE)
plot(df_prcomp)

5.2.3 Scree Plot II. - ggplot

# calculate the proportion of total variance explained by each principal
# component
var_explained <- df_prcomp$sdev^2 / sum(df_prcomp$sdev^2)
# create scree plot; ggplot() replaces qplot(), which is deprecated
# since ggplot2 3.4.0
library(ggplot2)
ggplot(
  data.frame(pc = seq_along(var_explained), var_explained = var_explained),
  aes(x = pc, y = var_explained)
) +
  geom_point() +
  geom_line() +
  xlab("Principal Component") +
  ylab("Variance Explained") +
  ggtitle("Scree Plot") +
  ylim(0, 1)

remove(var_explained)

5.2.4 Scree Plot III. - caret-based

# Proportion of total component standard deviation per principal
# component; vapply() replaces sapply() for a type-stable result, and
# the per-column sds are computed once instead of twice.
# NOTE(review): these proportions are based on standard deviations, not
# variances (sd^2), so they are not directly comparable to the
# prcomp-based scree values above — confirm which definition is intended.
pc_sds <- vapply(df_train_prpr, sd, numeric(1))
var_explained_caret <- pc_sds / sum(pc_sds)
remove(pc_sds)
var_explained_caret
##        PC1        PC2        PC3        PC4        PC5 
## 0.42991713 0.27020519 0.12676675 0.09245660 0.08065433
sum(var_explained_caret)
## [1] 1
remove(var_explained_caret)

5.2.5 PCA plots

# Summary plots for a PCA result: per-component and cumulative
# proportion of variance explained, plus bar and line scree plots.
#
# x: an object with an 'sdev' component (e.g. the result of prcomp()).
# Prints the proportions of variance, draws four plots in a 2x2 layout,
# and invisibly returns the vector of variance proportions.
pcaCharts <- function(x) {
        x.var <- x$sdev ^ 2
        x.pvar <- x.var / sum(x.var)
        print("proportions of variance:")
        print(x.pvar)
        # Save and restore the caller's par() settings even if an error
        # occurs while plotting. (The original reset mfrow to c(1, 1)
        # unconditionally, clobbering any pre-existing layout and
        # leaking the 2x2 layout on error.)
        old_par <- par(mfrow = c(2, 2))
        on.exit(par(old_par), add = TRUE)
        plot(
                x.pvar,
                xlab = "Principal component",
                ylab = "Proportion of variance explained",
                ylim = c(0, 1),
                type = 'b'
        )
        plot(
                cumsum(x.pvar),
                xlab = "Principal component",
                ylab = "Cumulative Proportion of variance explained",
                ylim = c(0, 1),
                type = 'b'
        )
        screeplot(x)
        screeplot(x, type = "l")
        invisible(x.pvar)
}

pcaCharts(df_prcomp)
## [1] "proportions of variance:"
## [1] 0.59745505 0.25590584 0.05587410 0.02655431 0.02405089 0.01947729 0.01223229
## [8] 0.00845021

remove(df_prcomp)
remove(pcaCharts)

5.2.6 SPLOM PCAs (first 3)

# Temporary factor version of 'vs' used to color the SPLOM groups.
df_train$vs_factor <- as.factor(ifelse(df_train$vs == 0, "v_shaped", "straight"))
# Common axis range across the first three principal components.
panelRange <- extendrange(df_train_prpr[, 1:3])
library(ellipse)
## 
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
## 
##     pairs
# Panel function for the upper triangle of the PCA splom: draws
# reference circles at radii t = 1, 2, 3 (via ellipse() on the 2x2
# identity matrix), then overlays the grouped points.
upperp <- function(...)
  {
    args <- list(...)
    # Shared reference-line style for all three circles.
    ref_style <- trellis.par.get("reference.line")
    for (radius in 1:3) {
      circ <- ellipse(diag(rep(1, 2)), t = radius)
      panel.xyplot(circ[, 1], circ[, 2],
                   type = "l",
                   lty = ref_style$lty,
                   col = ref_style$col,
                   lwd = ref_style$lwd)
    }
    panel.xyplot(args$x, args$y, groups = args$groups, subscripts = args$subscripts)
  }
# SPLOM of the first three principal components, colored by engine
# shape, with the reference-circle panel function in the upper triangle
# and an empty lower triangle.
splom(as.data.frame(df_train_prpr[, 1:3]),
      groups = df_train$vs_factor,
      type = c("p", "g"),
      as.table = TRUE,
            lower.panel = function(...){}, 
      upper.panel = upperp,
      auto.key = list(columns = 2),
      prepanel.limits = function(x) panelRange)

remove(panelRange)
df_train$vs_factor <- NULL

5.3 trainControl() Settings

trainControl()$method
## [1] "boot"
trainControl()$number
## [1] 25
trainControl()$repeats
## [1] NA
trainControl()$p
## [1] 0.75

5.4 Preprocess and Train

# Add the quadratic weight term used by the earlier interpretable model.
df_train_sq <- df_train
df_train_sq$wt2 <- df_train_sq$wt^2

# check CPU in terminal: 'lscpu'
# Spin up a 5-worker parallel backend for caret's resampling.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)

# Train a linear model; the preprocessing (BoxCox/center/scale/PCA) is
# re-estimated within each bootstrap resample.
lm_model_caret <- caret::train(
        mpg ~ .,
        data = df_train_sq,
        preProc = c("BoxCox", "center", "scale", "pca"),
        method = "lm"
)

# Shut the cluster down after training to free the worker processes.
stopCluster(cl)
remove(cl)
lm_model_caret
## Linear Regression 
## 
## 28 samples
##  8 predictor
## 
## Pre-processing: Box-Cox transformation (6), centered (8), scaled (8),
##  principal component signal extraction (8) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 28, 28, 28, 28, 28, 28, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   2.650173  0.8446887  2.168273
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

5.5 Predict

# insert quadratic element in df_test (in line with df_train)
df_test$wt2 <- df_test$wt^2
# Apply the transformations:
# predict() applies the model's stored preprocessing to df_test before
# predicting.
predicted <- predict(lm_model_caret, df_test)
df_test$observed <- df_test$mpg
df_test$predicted <- round(predicted, 1)
# Residual convention used here: predicted - observed.
df_test$residual <- round(df_test$predicted - df_test$observed, 1)
df_test[, c("predicted", "observed", "residual")]
##              predicted observed residual
## Mazda RX4         21.7     21.0      0.7
## Merc 280C         20.0     17.8      2.2
## AMC Javelin       17.9     15.2      2.7
## Lotus Europa      27.8     30.4     -2.6

5.6 Results

5.6.1 Training Set

caret::R2(pred = predict(lm_model_caret, df_train_sq), 
          obs = df_train_sq$mpg)
## [1] 0.8804422
caret::RMSE(pred = predict(lm_model_caret, df_train_sq), 
          obs = df_train_sq$mpg)
## [1] 2.05518

5.6.2 Test Set

caret::R2(pred = df_test$predicted, 
          obs = df_test$observed)
## [1] 0.9980506
caret::RMSE(pred = df_test$predicted, 
            obs = df_test$observed)
## [1] 2.201136

5.7 Compare Prediction vs Observed

5.7.1 Plot Predicted vs Observed

plot(x = df_test$predicted,
     y = df_test$observed)
abline(a=0, b=1)

5.7.2 Plot Using Model

plot(x = predict(lm_model_caret, df_test),
     y = df_test$observed)
abline(a=0, b=1)

5.7.3 ggplot

# Predicted vs actual values with the identity line for reference.
# Columns of the 'data' argument are referenced by bare name inside
# aes(); the original used df_test$observed, which bypasses ggplot's
# data masking and breaks under faceting.
ggplot(df_test, aes(x = predict(lm_model_caret, df_test), y = observed)) +
        geom_point() +
        geom_abline(intercept = 0, slope = 1) +
        labs(x = 'Predicted Values', y = 'Actual Values', title = 'Predicted vs. Actual Values')